F1 Races Results dataset 1950 to 2024¶

importing libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import bar_chart_race as bcr
import os
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
In [2]:
os.getcwd()
Out[2]:
'C:\\Users\\aksha\\Jupyter\\Project'
In [3]:
os.chdir ('D:\$TUDY\F1')
os.getcwd()
Out[3]:
'D:\\$TUDY\\F1'
In [4]:
drivers = pd.read_csv('drivers_updated.csv')
laps = pd.read_csv('fastest_laps_updated.csv')
teams = pd.read_csv('teams_updated.csv')
win = pd.read_csv('winners.csv')

Data Loading & Data Cleaning¶

Drivers¶

In [5]:
drivers.head()
Out[5]:
Pos Driver Nationality Car PTS year Code
0 1 Nino Farina ITA Alfa Romeo 30.0 1950 FAR
1 2 Juan Manuel Fangio ARG Alfa Romeo 27.0 1950 FAN
2 3 Luigi Fagioli ITA Alfa Romeo 24.0 1950 FAG
3 4 Louis Rosier FRA Talbot-Lago 13.0 1950 ROS
4 5 Alberto Ascari ITA Ferrari 11.0 1950 ASC
In [6]:
drivers.describe()
Out[6]:
PTS year
count 1661.000000 1661.000000
mean 31.138170 1987.124624
std 60.446033 21.849750
min 0.000000 1950.000000
25% 3.000000 1968.000000
50% 9.000000 1987.000000
75% 32.000000 2006.000000
max 575.000000 2024.000000
In [7]:
# filling missing values
drivers['Car'].fillna({"Unknown": 0}, inplace=True)
In [8]:
drivers.shape
Out[8]:
(1661, 7)
In [9]:
drivers.shape
Out[9]:
(1661, 7)
In [10]:
drivers.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1661 entries, 0 to 1660
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pos          1661 non-null   object 
 1   Driver       1661 non-null   object 
 2   Nationality  1661 non-null   object 
 3   Car          1650 non-null   object 
 4   PTS          1661 non-null   float64
 5   year         1661 non-null   int64  
 6   Code         1661 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 91.0+ KB
In [11]:
drivers.isnull().sum()
Out[11]:
Pos             0
Driver          0
Nationality     0
Car            11
PTS             0
year            0
Code            0
dtype: int64
In [12]:
points = drivers.groupby('Driver').agg(total=('PTS', sum)).reset_index()
points = points.sort_values('total', ascending=False).head(20)
points
Out[12]:
Driver total
215 Lewis Hamilton 4681.5
348 Sebastian Vettel 3098.0
247 Max Verstappen 2755.5
102 Fernando Alonso 2300.0
210 Kimi Räikkönen 1873.0
266 Nico Rosberg 1594.5
353 Sergio Perez 1593.0
249 Michael Schumacher 1566.0
63 Daniel Ricciardo 1322.0
166 Jenson Button 1235.0
47 Charles Leclerc 1212.0
100 Felipe Massa 1167.0
44 Carlos Sainz 1090.5
382 Valtteri Bottas 1081.0
239 Mark Webber 1047.5
2 Alain Prost 768.5
212 Lando Norris 746.0
383 Valtteri Bottas 716.0
341 Rubens Barrichello 658.0
24 Ayrton Senna 610.0

Teams¶

In [13]:
teams.head()
Out[13]:
Pos Team PTS year
0 1 Vanwall 48.0 1958
1 2 Ferrari 40.0 1958
2 3 Cooper Climax 31.0 1958
3 4 BRM 18.0 1958
4 5 Maserati 6.0 1958
In [14]:
teams.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pos     695 non-null    object 
 1   Team    695 non-null    object 
 2   PTS     695 non-null    float64
 3   year    695 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 21.8+ KB
In [15]:
teams.isnull().sum()
Out[15]:
Pos     0
Team    0
PTS     0
year    0
dtype: int64
In [16]:
team_points = teams.groupby('Team').agg(total=('PTS', sum)).reset_index()
team_points = team_points.sort_values('total', ascending=False).head(20)
team_points
Out[16]:
Team total
47 Ferrari 9877.0
111 Mercedes 7318.5
105 McLaren Mercedes 4018.0
134 Red Bull Racing Renault 2298.0
137 Renault 1777.0
131 Red Bull Racing Honda 1321.5
135 Red Bull Racing TAG Heuer 1255.0
175 Williams Renault 1200.0
132 Red Bull Racing Honda RBPT 1136.0
53 Force India Mercedes 1039.0
174 Williams Mercedes 867.0
104 McLaren Honda 832.0
91 Lotus Renault 815.0
133 Red Bull Racing RBPT 759.0
103 McLaren Ford 755.0
86 Lotus Ford 745.0
129 RBR Renault 651.5
160 Tyrrell Ford 560.0
168 Williams BMW 506.0
24 Benetton Ford 481.5

Winners¶

In [17]:
win.head()
Out[17]:
Grand Prix Date Winner Car Laps Time Name Code year month hours minutes seconds Unnamed: 12 Unnamed: 13
0 Great Britain 13-05-1950 Nino Farina Alfa Romeo 70.0 13:23.6 FAR 1950 5 2 13 24 NaN NaN
1 Monaco 21-05-1950 Juan Manuel Fangio Alfa Romeo 100.0 13:18.7 FAN 1950 5 3 13 19 NaN NaN
2 Indianapolis 500 30-05-1950 Johnnie Parsons Kurtis Kraft Offenhauser 138.0 46:56.0 PAR 1950 5 2 46 56 NaN NaN
3 Switzerland 04-06-1950 Nino Farina Alfa Romeo 42.0 02:53.7 FAR 1950 6 2 2 54 NaN NaN
4 Belgium 18-06-1950 Juan Manuel Fangio Alfa Romeo 35.0 47:26.0 FAN 1950 6 2 47 26 NaN NaN
In [18]:
win.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1110 entries, 0 to 1109
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Grand Prix   1110 non-null   object 
 1   Date         1110 non-null   object 
 2   Winner       1110 non-null   object 
 3   Car          1110 non-null   object 
 4   Laps         1107 non-null   float64
 5   Time         1107 non-null   object 
 6   Name Code    1110 non-null   object 
 7   year         1110 non-null   int64  
 8   month        1110 non-null   int64  
 9   hours        1110 non-null   int64  
 10  minutes      1110 non-null   int64  
 11  seconds      1110 non-null   int64  
 12  Unnamed: 12  0 non-null      float64
 13  Unnamed: 13  1 non-null      object 
dtypes: float64(2), int64(5), object(7)
memory usage: 121.5+ KB
In [19]:
win.drop_duplicates(inplace=True)
In [20]:
win.isnull().sum()
Out[20]:
Grand Prix        0
Date              0
Winner            0
Car               0
Laps              3
Time              3
Name Code         0
year              0
month             0
hours             0
minutes           0
seconds           0
Unnamed: 12    1110
Unnamed: 13    1109
dtype: int64
In [21]:
win['Laps'].fillna(0, inplace=True)
win['Time'].fillna('0', inplace=True)
In [22]:
from dateutil.parser import parse
win['Date'] = win['Date'].apply(parse)
In [23]:
win['Total Time in Seconds'] = win['hours'] * 3600 + win['minutes'] * 60 + win['seconds']
win.head()
Out[23]:
Grand Prix Date Winner Car Laps Time Name Code year month hours minutes seconds Unnamed: 12 Unnamed: 13 Total Time in Seconds
0 Great Britain 1950-05-13 Nino Farina Alfa Romeo 70.0 13:23.6 FAR 1950 5 2 13 24 NaN NaN 8004
1 Monaco 1950-05-21 Juan Manuel Fangio Alfa Romeo 100.0 13:18.7 FAN 1950 5 3 13 19 NaN NaN 11599
2 Indianapolis 500 1950-05-30 Johnnie Parsons Kurtis Kraft Offenhauser 138.0 46:56.0 PAR 1950 5 2 46 56 NaN NaN 10016
3 Switzerland 1950-04-06 Nino Farina Alfa Romeo 42.0 02:53.7 FAR 1950 6 2 2 54 NaN NaN 7374
4 Belgium 1950-06-18 Juan Manuel Fangio Alfa Romeo 35.0 47:26.0 FAN 1950 6 2 47 26 NaN NaN 10046
In [24]:
win['Average Time per lap'] = win['Total Time in Seconds']/win['Laps']
win.head()
Out[24]:
Grand Prix Date Winner Car Laps Time Name Code year month hours minutes seconds Unnamed: 12 Unnamed: 13 Total Time in Seconds Average Time per lap
0 Great Britain 1950-05-13 Nino Farina Alfa Romeo 70.0 13:23.6 FAR 1950 5 2 13 24 NaN NaN 8004 114.342857
1 Monaco 1950-05-21 Juan Manuel Fangio Alfa Romeo 100.0 13:18.7 FAN 1950 5 3 13 19 NaN NaN 11599 115.990000
2 Indianapolis 500 1950-05-30 Johnnie Parsons Kurtis Kraft Offenhauser 138.0 46:56.0 PAR 1950 5 2 46 56 NaN NaN 10016 72.579710
3 Switzerland 1950-04-06 Nino Farina Alfa Romeo 42.0 02:53.7 FAR 1950 6 2 2 54 NaN NaN 7374 175.571429
4 Belgium 1950-06-18 Juan Manuel Fangio Alfa Romeo 35.0 47:26.0 FAN 1950 6 2 47 26 NaN NaN 10046 287.028571

Racing Nations: A World of Formula 1 Drivers¶

In [25]:
import geopandas as gpd
import plotly.express as px

# Step 2: Count the number of drivers by nationality
driver_counts = drivers['Nationality'].value_counts().reset_index()
driver_counts.columns = ['Nationality', 'Count']

# Step 3: Map nationalities to country names
nationality_to_country = {
    'BRA': 'Brazil',
    'SWE': 'Sweden',
    'GBR': 'United Kingdom',
    'ITA': 'Italy',
    'FRA': 'France',
    'GER': 'Germany',
    'USA': 'United States',
    'AUS': 'Australia',
    'CAN': 'Canada',
    'JPN': 'Japan',
    'ESP': 'Spain',
    'NED': 'Netherlands',
    'ARG': 'Argentina',
    'FIN': 'Finland',
    'AUT': 'Austria',
    'NZL': 'New Zealand',
    'BEL': 'Belgium',
    'SUI': 'Switzerland',
    'MEX': 'Mexico',
    'RSA': 'South Africa',
    'DEN': 'Denmark',
    'RUS': 'Russia',
    'MON': 'Monaco',
    'POL': 'Poland',
    'THA': 'Thailand',
    'VEN': 'Venezuela',
    'COL': 'Colombia',
    'IND': 'India',
    'POR': 'Portugal',
    'IRL': 'Ireland',
    'RHO': 'Rhodesia',
    'CHN': 'China',
    'HUN': 'Hungary',
    'CHI': 'Chile',
    'MAS': 'Malaysia',
    'INA': 'Indonesia',
    'RAF': 'French Equatorial Africa'
    # Add more mappings as required
}

# Apply the mapping to convert nationalities to country names
driver_counts['Country'] = driver_counts['Nationality'].map(nationality_to_country)

# Load the world map
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Merge driver counts with the world map
world = world.merge(driver_counts, how="left", left_on="name", right_on="Country")

# Plot the map
fig = px.choropleth(world,
                    locations='iso_a3',
                    color='Count',
                    hover_name='Country',
                    hover_data=['Count'],
                    projection='natural earth',
                    title='Racing Nations: A World of Formula 1 Drivers',
                    color_continuous_scale='Oranges')  # Change the color scale here

# Customize the map style
fig.update_geos(
    visible=False,
    showcountries=True,
    countrycolor="White",
    coastlinecolor="White",
    showland=True,
    landcolor="LightGrey",
    showocean=True,
    oceancolor="LightBlue",
    showlakes=True,
    lakecolor="LightBlue",
    showrivers=True,
    rivercolor="LightBlue"
)

# Add subtitle and data source
fig.update_layout(
    title={
        'text': "Racing Nations: A World of Formula 1 Drivers",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    annotations=[
        dict(
            text="Data Source: Official site of Formula1 (https://www.formula1.com/)",
            showarrow=False,
            xref="paper",
            yref="paper",
            x=0,
            y=-0.1
        ),
        dict(
            text="Number of Drivers by Nationality Worldwide",
            xref="paper",
            yref="paper",
            x=0.5,
            y=-0.25,
            showarrow=False,
            font=dict(
                family="Arial",
                size=12,
                color="grey"
            )
        )
    ]
)

# Show the plot
fig.show()

Formula 1 Drivers: A Global Tapestry¶

In [26]:
# Group by 'Car' and sum the points for each car
car_performance = drivers.groupby('Car')['PTS'].sum().reset_index()

# Sort the DataFrame by total points in descending order and select top 25 distinct cars
top_25_cars = car_performance.sort_values(by='PTS', ascending=False).head(25)

# Create the scatter plot
fig = px.scatter(top_25_cars,
                 x='Car',
                 y='PTS',
                 hover_data=top_25_cars.columns,
                 title='Top 25 Distinct Cars Based on Performance',
                 labels={'PTS': 'Total Points', 'Car': 'Car Name'},
                 template='plotly_dark')

# Customize marker symbol and size
fig.update_traces(marker=dict(symbol='hexagram', size=12, color='orange', line=dict(width=1, color='black')),
                  selector=dict(mode='markers'))

# Adjust axis labels and title
fig.update_layout(
    xaxis=dict(
        title='Car Name',
        showgrid=True,
        gridcolor='rgba(100, 100, 100, 0.5)',  # Darker shade of gray
        tickangle=45,
        tickfont=dict(size=10, family="Helvetica, Arial, sans-serif")
    ),
    yaxis=dict(
        title='Total Points',
        showgrid=True,
        gridcolor='rgba(100, 100, 100, 0.5)'  # Darker shade of gray
    ),
    title={
        'text': "The Pursuit of Victory: Top 25 Cars Based on Performance",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    annotations=[
        dict(
            text="Visualization of total points scored by top 25 distinct cars",
            showarrow=False,
            xref="paper",
            yref="paper",
            x=0,
            y=-10
        )
    ]
)

# Adjust margin and spacing
fig.update_layout(
    margin=dict(l=50, r=50, t=100, b=50),
    height=600,
    width=1100,
    font=dict(family="Helvetica, Arial, sans-serif", size=12)
)

# Show the plot
fig.show()

Data Visualization¶

In [27]:
fig = px.histogram(points, x='Driver',y='total', 
                   color='total')
fig.update_layout(
    title='Drivers score ranking',
    xaxis_title='Drivers',
    yaxis_title='Total',
    font={'color':'Black', 'size':18,'family':'Courrier New'}
    )
fig.show()
In [28]:
fig2 = px.histogram(team_points, x='Team',y='total', 
                   color='total')
fig2.update_layout(
    title='Teams score ranking',
    xaxis_title='Teams',
    yaxis_title='Total',
    font={'color':'Black', 'size':18,'family':'Courrier New'}
    )
fig2.show()
In [29]:
leaderboard = drivers.groupby('Driver').sum('PTS').sort_values(by='PTS', ascending=False).reset_index()
In [30]:
fig = px.bar(leaderboard.head(20), x='Driver', y='PTS', color='PTS', title='Leaderboard most career points', height=700)
fig.show()
In [31]:
substring = 'Red Bull'
rb = (teams[teams['Team'].str.contains(substring)])
for Team in rb:
    dict = {'Pos' : '', 'Team' : 'Red Bull', 'PTS' : rb['PTS'].sum()}
teams = teams[teams['Team'].str.contains(substring)==False]
teams = teams._append(dict, ignore_index=True)

williams = 'Williams'
w = teams[teams['Team'].str.contains(williams)]
for Team in w:
    dict = {'Pos' : '', 'Team' : 'Williams', 'PTS' : w['PTS'].sum()}
teams = teams[teams['Team'].str.contains(williams)==False]
teams = teams._append(dict, ignore_index=True)

mclaren = 'McLaren'
mc = teams[teams['Team'].str.contains(mclaren)]
for Team in mc:
    dict  = {'Pos' : '', 'Team' : 'McLaren', 'PTS' : mc['PTS'].sum()}
teams = teams[teams['Team'].str.contains(mclaren)==False]
teams = teams._append(dict, ignore_index=True)

leaderboard_tm = teams.groupby('Team').sum('PTS').sort_values(by='PTS', ascending=False).reset_index().head(5)

fig = px.bar(leaderboard_tm, x='Team', y='PTS', color='PTS', title='Leaderboard most Team points', height=700)
fig.show()
In [32]:
leaderboard_win = win.groupby('Winner').size().reset_index(name='Count').sort_values(by='Count', ascending=False).head(10)

fig = px.bar(leaderboard_win, x='Winner', y='Count', color='Count', title='Leaderboard most GP wins by Driver', height=700)
fig.update_yaxes(title_text='GP wins')
fig.show()

Nationalities - Drivers¶

In [34]:
plt.figure(figsize=(10, 6))
drivers['Nationality'].value_counts().plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Distribution of drivers nationalities')
plt.xlabel('Nationalities')
plt.ylabel('Number of drivers')
plt.show()
No description has been provided for this image

Visualizing the Data of Winners¶

In [35]:
plt.figure(figsize=(20,10))
sns.countplot(data = win, x = 'Grand Prix', order = win['Grand Prix'].value_counts().index)
plt.xticks(rotation = 90)
plt.title('Most Grandprix hosted')
plt.ylabel('Number of races')
plt.xlabel('Grand Prix')
plt.show()
No description has been provided for this image
In [36]:
years = win['year'].value_counts()
plt.figure(figsize=(20,10))
sns.lineplot(years)
plt.title('Number of races each year')
plt.xlabel('Years')
plt.ylabel('Number of races')
plt.show()
No description has been provided for this image
In [37]:
plt.figure(figsize=(20,10))
sns.countplot(data = win, x = 'Winner', order=win['Winner'].value_counts().index)
plt.xticks(rotation = 90)
plt.title('Number of races won by racers')
plt.ylabel('Number of races won')
plt.xlabel('Winner name')
plt.show()
No description has been provided for this image
In [38]:
plt.figure(figsize=(20,10))
sns.histplot(data = win, x = 'Average Time per lap', bins = 30)
plt.title('Distribution of average time per lap')
plt.ylabel('Number of races')
plt.xlabel('Seconds')
plt.show()
No description has been provided for this image
In [39]:
top_10_grandprix_by_laps_time =win.groupby('Grand Prix')['Average Time per lap'].mean().sort_values(ascending = False).head(10)
top_10_grandprix_by_laps_time
Out[39]:
Grand Prix
Pescara         597.944444
Germany         250.361329
Switzerland     155.911351
Belgium         151.823593
Morocco         146.320755
Tuscany         141.949153
South Korea     127.118182
Azerbaijan      125.480392
Singapore       116.930485
Saudi Arabia    111.760000
Name: Average Time per lap, dtype: float64
In [40]:
plt.figure(figsize=(20,10))
sns.barplot(x = top_10_grandprix_by_laps_time.index, y = top_10_grandprix_by_laps_time.values)
plt.xticks(rotation = 90)
plt.title('Top 10 Grand Prix by average lap time')
plt.ylabel('Average lap time')
plt.xlabel('Grand Prix')
plt.show()
No description has been provided for this image